Loading the data:

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#making a vector of samples
sample_names <- c(
    "case1_YF", "case1_ZY",
    "case2_YF", "case2_ZY", "case2_ZC",
    "case3_YF", "case3_ZY",
    "case4_ZY"
)

Reading the dataand creating seurat object

# Initializing a list to hold the Seurat objects
seurat_objects <- list()

# Looping through each sample
for (sample in sample_names) {
    # Converting sample name to lowercase folder name 
    folder_name <- sample
    
    # Read the data
    data <- Read10X(data.dir = folder_name)
    
    # Create Seurat object
    seurat_obj <- CreateSeuratObject(
        counts = data,
        project = sample,
        min.cells = 3,
        min.features = 200
    )
    
    # Store in the list
    seurat_objects[[sample]] <- seurat_obj
}

Adding Mitochondrial Percentage (% mt) to All Samples

for (sample in names(seurat_objects)) {
    seurat_objects[[sample]][["percent.mt"]] <- PercentageFeatureSet(seurat_objects[[sample]], pattern = "^MT-")
}

Visualizing QC metrics: Violin Plots

for (sample in names(seurat_objects)) {
    print(
        VlnPlot(
            seurat_objects[[sample]],
            features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
            ncol = 3,
            pt.size = 0.1
        ) + ggtitle(paste("QC Metrics for", sample))
    )
}
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

Scatter Plots:

for (sample in names(seurat_objects)) {
    plot1 <- FeatureScatter(
        seurat_objects[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt"
    ) + ggtitle(paste("Counts vs Mito %:", sample))
    
    plot2 <- FeatureScatter(
        seurat_objects[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA"
    ) + ggtitle(paste("Counts vs Features:", sample))
    
    print(plot1 + plot2)
}

Integrated Plot

# Combine QC metrics from all samples
qc_data_all <- data.frame()

for (sample in names(seurat_objects)) {
    meta <- seurat_objects[[sample]]@meta.data
    meta$Sample <- sample
    qc_data_all <- rbind(qc_data_all, meta)
}

# Plot: nFeature_RNA vs nCount_RNA colored by percent.mt, faceted by Sample
ggplot(qc_data_all, aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
    geom_point(size = 0.5) +
    scale_color_viridis_c() +
    facet_wrap(~ Sample) +
    theme_bw() +
    labs(title = "Integrated QC Plot: All Samples",
         x = "Total Molecules (nCount_RNA)",
         y = "Unique Genes (nFeature_RNA)",
         color = "% Mitochondrial")

Filtering across all samples:

# Define filtering thresholds
min_features <- 200
max_features <- 6000
min_counts <- 500
max_counts <- 50000
max_mito <- 25

# Apply filtering to all samples
for (sample in names(seurat_objects)) {
    seurat_objects[[sample]] <- subset(
        seurat_objects[[sample]],
        subset = nFeature_RNA > min_features &
                 nFeature_RNA < max_features &
                 nCount_RNA > min_counts &
                 nCount_RNA < max_counts &
                 percent.mt < max_mito
    )
}

Filtering Strategy: For this analysis, I applied the following filtering thresholds to ensure retention of high-quality single-cell data while removing likely low-quality cells and doublets: -nFeature_RNA (number of detected genes per cell): 200–6000 -nCount_RNA (total UMI counts per cell): 500–50,000 -percent.mt (mitochondrial gene percentage): < 25%

These thresholds were determined based on visual inspection of violin plots and scatter plots. The lower bound of 200 genes was chosen to exclude droplets likely representing empty captures or low-complexity cells, visible as a distinct left-hand “tail” in the nFeature_RNA distribution. The upper bound of 6000 genes and 50,000 UMIs was selected to remove outliers that likely represent doublets or multiplets, which appeared as a sparse set of cells with markedly higher RNA content than the majority of cells. For mitochondrial percentage, the 25% cutoff was informed by the sharp increase in percent.mt seen in low-quality or apoptotic cells, as reflected in the QC plots where a notable spread of cells with >25% mitochondrial reads was observed.

QC table;

# Initialize a QC summary table
qc_summary <- data.frame(
    Sample = character(),
    Cells_Before = integer(),
    Genes_Before = integer(),
    Cells_After = integer(),
    Genes_After = integer(),
    stringsAsFactors = FALSE
)

# Loop to fill the table
for (sample in sample_names) {
    # Load raw data again to get BEFORE counts
    raw_data <- Read10X(data.dir = sample)
    raw_seurat <- CreateSeuratObject(
        counts = raw_data,
        project = sample,
        min.cells = 3,
        min.features = 200
    )
    
    # Cells/genes BEFORE filtering
    raw_cells <- ncol(raw_seurat)
    raw_genes <- nrow(raw_seurat)
    
    # Cells/genes AFTER filtering
    filtered_cells <- ncol(seurat_objects[[sample]])
    filtered_genes <- nrow(seurat_objects[[sample]])
    
    qc_summary <- rbind(qc_summary, data.frame(
        Sample = sample,
        Cells_Before = raw_cells,
        Genes_Before = raw_genes,
        Cells_After = filtered_cells,
        Genes_After = filtered_genes
    ))
}

# View QC summary table
print(qc_summary)
##     Sample Cells_Before Genes_Before Cells_After Genes_After
## 1 case1_YF        21481        21640        9789       21640
## 2 case1_ZY        13567        21572        8945       21572
## 3 case2_YF        11717        21715       11335       21715
## 4 case2_ZY         9352        22523        9114       22523
## 5 case2_ZC         6605        18492        6470       18492
## 6 case3_YF         9169        22668        7705       22668
## 7 case3_ZY         8532        22970        7042       22970
## 8 case4_ZY         1516        16470        1426       16470

Visualize QC (AFTER filtering)

for (sample in names(seurat_objects)) {
    print(
        VlnPlot(
            seurat_objects[[sample]],
            features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
            ncol = 3,
            pt.size = 0.1
        ) + ggtitle(paste("QC AFTER Filtering:", sample))
    )
}
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

Alternative strategies for setting thresholds without visual inspection include:

Median Absolute Deviation (MAD): Automatically defines outliers as cells beyond a set number of MADs from the median (e.g., >3× MAD) for metrics like gene counts or mitochondrial percentage (Lun et al., 2016).

EmptyDrops:A statistical test to distinguish true cells from empty droplets based on expression profiles, reducing reliance on arbitrary cutoffs (Lun et al., 2019).

scater’s quickPerCellQC:Provides automated QC using data-driven thresholds based on per-cell metrics, integrating outlier detection and batch correction (McCarthy et al., 2017).

CellBender:Uses a deep generative model to remove ambient RNA and identify true cells without manual thresholding (Fleming et al., 2022).

Doublet Detection

library(scDblFinder)
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:lubridate':
## 
##     %within%
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:sp':
## 
##     %over%
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
## 
##     Assays
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
library(SeuratDisk)
## Registered S3 method overwritten by 'SeuratDisk':
##   method            from  
##   as.sparse.H5Group Seurat
# Store doublet results
doublet_results <- list()

for (sample in names(seurat_objects)) {
    cat("Processing:", sample, "\n")
    
    # Convert Seurat -> SingleCellExperiment
    sce <- as.SingleCellExperiment(seurat_objects[[sample]])
    
    # Run scDblFinder
    sce <- scDblFinder(sce)
    
    # Add doublet classification back to Seurat metadata
    seurat_objects[[sample]]$scDblFinder.class <- sce$scDblFinder.class
    seurat_objects[[sample]]$scDblFinder.score <- sce$scDblFinder.score
    
    # Save doublet info
    doublet_results[[sample]] <- table(sce$scDblFinder.class)
}
## Processing: case1_YF
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Layer 'data' is empty
## Warning: Layer 'scale.data' is empty
## Creating ~7832 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1362 cells excluded from training.
## iter=1, 1582 cells excluded from training.
## iter=2, 1596 cells excluded from training.
## Threshold found:0.448
## 1168 (11.9%) doublets called
## Processing: case1_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~7156 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1386 cells excluded from training.
## iter=1, 1490 cells excluded from training.
## iter=2, 1504 cells excluded from training.
## Threshold found:0.447
## 1097 (12.3%) doublets called
## Processing: case2_YF
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~9068 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1817 cells excluded from training.
## iter=1, 1923 cells excluded from training.
## iter=2, 1940 cells excluded from training.
## Threshold found:0.446
## 1394 (12.3%) doublets called
## Processing: case2_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~7292 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1354 cells excluded from training.
## iter=1, 1567 cells excluded from training.
## iter=2, 1554 cells excluded from training.
## Threshold found:0.41
## 1154 (12.7%) doublets called
## Processing: case2_ZC
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~5176 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 624 cells excluded from training.
## iter=1, 534 cells excluded from training.
## iter=2, 473 cells excluded from training.
## Threshold found:0.413
## 365 (5.6%) doublets called
## Processing: case3_YF
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~6164 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1225 cells excluded from training.
## iter=1, 1335 cells excluded from training.
## iter=2, 1351 cells excluded from training.
## Threshold found:0.472
## 758 (9.8%) doublets called
## Processing: case3_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~5634 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1057 cells excluded from training.
## iter=1, 1218 cells excluded from training.
## iter=2, 1225 cells excluded from training.
## Threshold found:0.458
## 779 (11.1%) doublets called
## Processing: case4_ZY
## Warning: Layer 'data' is empty
## Layer 'scale.data' is empty
## Creating ~1500 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 164 cells excluded from training.
## iter=1, 168 cells excluded from training.
## iter=2, 160 cells excluded from training.
## Threshold found:0.605
## 70 (4.9%) doublets called

Removing doublets:

for (sample in names(seurat_objects)) {
    seurat_objects[[sample]] <- subset(
        seurat_objects[[sample]],
        subset = scDblFinder.class == "singlet"
    )
}

Doublet Results:

# Create a summary table
dbl_table <- data.frame(
    Sample = names(doublet_results),
    Singlets = sapply(doublet_results, function(x) x["singlet"]),
    Doublets = sapply(doublet_results, function(x) x["doublet"])
)

print(dbl_table)
##                    Sample Singlets Doublets
## case1_YF.singlet case1_YF     8621     1168
## case1_ZY.singlet case1_ZY     7848     1097
## case2_YF.singlet case2_YF     9941     1394
## case2_ZY.singlet case2_ZY     7960     1154
## case2_ZC.singlet case2_ZC     6105      365
## case3_YF.singlet case3_YF     6947      758
## case3_ZY.singlet case3_ZY     6263      779
## case4_ZY.singlet case4_ZY     1356       70

Merging Samples:

# Merge all samples into one combined Seurat object
combined_obj <- merge(
    seurat_objects[[1]],
    y = seurat_objects[-1],
    add.cell.ids = names(seurat_objects),
    project = "MergedSamples"
)

Count Normalization

combined_obj <- NormalizeData(
    combined_obj,
    normalization.method = "LogNormalize",
    scale.factor = 10000
)
## Normalizing layer: counts.case1_YF
## Normalizing layer: counts.case1_ZY
## Normalizing layer: counts.case2_YF
## Normalizing layer: counts.case2_ZY
## Normalizing layer: counts.case2_ZC
## Normalizing layer: counts.case3_YF
## Normalizing layer: counts.case3_ZY
## Normalizing layer: counts.case4_ZY

I applied the LogNormalize method implemented in Seurat, following the same approach described in the paper. This method normalizes the gene expression measurements for each cell by the total expression (library size), multiplies the result by a scale factor (10,000), and then log-transforms the data (natural log). This normalization reduces technical variability caused by differing sequencing depths and brings all cells to a comparable scale, which is critical before identifying highly variable genes and performing downstream analysis.

Feature Selection (Highly Variable Genes)

# Find 2000 HVGs in the combined dataset
combined_obj <- FindVariableFeatures(
    combined_obj,
    selection.method = "vst",
    nfeatures = 2000
)
## Finding variable features for layer counts.case1_YF
## Finding variable features for layer counts.case1_ZY
## Finding variable features for layer counts.case2_YF
## Finding variable features for layer counts.case2_ZY
## Finding variable features for layer counts.case2_ZC
## Finding variable features for layer counts.case3_YF
## Finding variable features for layer counts.case3_ZY
## Finding variable features for layer counts.case4_ZY
# Plot top HVGs
top10 <- head(VariableFeatures(combined_obj), 10)
hvg_plot <- VariableFeaturePlot(combined_obj)
hvg_plot <- LabelPoints(hvg_plot, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
print(hvg_plot + ggtitle("Highly Variable Features (Combined Dataset)"))
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 4230 rows containing missing values or values outside the scale range
## (`geom_point()`).

For feature selection, I applied the variance-stabilizing transformation (VST) method implemented in Seurat’s FindVariableFeatures() function, using the combined dataset after merging all samples. This method models the mean–variance relationship of each gene and ranks genes based on their standardized variance across all cells. I selected the top 2000 highly variable genes (HVGs), which is a widely used standard that balances sensitivity for detecting meaningful biological signals while minimizing noise from lowly expressed or non-informative genes.

The plot of highly variable features visualizes the standardized variance (y-axis) against the average expression (x-axis) for all detected genes. In total, 2000 genes (shown in red) were identified as highly variable, while ~19,640 genes (shown in black) were considered non-variable and excluded from downstream dimensionality reduction. Key HVGs included genes such as SPP1, JCHAIN, IGKC, IGLC2, APOC1, and CXCL14, reflecting biologically diverse populations like immune, stromal, and epithelial cell types, consistent with expectations for tumor microenvironment heterogeneity. These 2000 HVGs were used for PCA and further clustering to ensure the most informative genes drove downstream analysis.

Scaling the data:

# Scale the data (standardize each gene: mean = 0, variance = 1)
combined_obj <- ScaleData(combined_obj)
## Centering and scaling data matrix

PCA

# Run PCA on the combined dataset using the 2000 HVGs
combined_obj <- RunPCA(
    combined_obj,
    features = VariableFeatures(combined_obj),
    verbose = FALSE
)

# Plot the Elbow plot to help decide how many PCs to keep
ElbowPlot(combined_obj, ndims = 50) +
    ggtitle("Elbow Plot: Combined Dataset")

I performed PCA on the combined dataset using the 2000 highly variable genes to reduce dimensionality while retaining the core biological signals.From the plot, there’s a noticeable inflection point around PC 15–20, after which additional PCs contribute only marginally to the explained variance. To balance capturing sufficient biological variation with minimizing noise, I selected 20 PCs for downstream clustering and visualization. This choice ensures robust representation of both abundant and rare cell populations while avoiding unnecessary dimensionality that could dilute true biological signals.

Clustering (Graph-Based, 20 PCs):

# Find neighbors and clusters using 20 PCs
combined_obj <- FindNeighbors(combined_obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
combined_obj <- FindClusters(combined_obj, resolution = 0.5)  
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 55041
## Number of edges: 1888784
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9364
## Number of communities: 24
## Elapsed time: 13 seconds

UMAP Visualisation:

# Run UMAP (2D embedding) using 20 PCs
combined_obj <- RunUMAP(combined_obj, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 21:43:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:43:05 Read 55041 rows and found 20 numeric columns
## 21:43:05 Using Annoy for neighbor search, n_neighbors = 30
## 21:43:05 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:43:10 Writing NN index file to temp file /scratch/4951884.1.academic/RtmpiNCrnZ/file2231064fbd67de
## 21:43:10 Searching Annoy index using 1 thread, search_k = 3000
## 21:43:27 Annoy recall = 100%
## 21:43:28 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 21:43:31 Initializing from normalized Laplacian + noise (using RSpectra)
## 21:43:37 Commencing optimization for 200 epochs, with 2375226 positive edges
## 21:43:37 Using rng type: pcg
## 21:43:57 Optimization finished
# Plot: Clusters (labeled)
DimPlot(combined_obj, reduction = "umap", group.by = "seurat_clusters", label = TRUE) +
    ggtitle("UMAP: Clusters")

# Plot: Sample origins (no labels)
DimPlot(combined_obj, reduction = "umap", group.by = "orig.ident", label = FALSE) +
    ggtitle("UMAP: Sample Origins")

# Cells per sample
cell_counts <- table(combined_obj$orig.ident)
print(cell_counts)
## 
## case1_YF case1_ZY case2_YF case2_ZC case2_ZY case3_YF case3_ZY case4_ZY 
##     8621     7848     9941     6105     7960     6947     6263     1356
# Number of clusters
num_clusters <- length(unique(Idents(combined_obj)))
print(paste("Number of clusters:", num_clusters))
## [1] "Number of clusters: 24"

For clustering, I applied Seurat’s graph-based clustering approach using the Louvain algorithm and visualized the results using UMAP for 2D embedding. I selected a resolution of 0.5, which provided a goood view of the data while avoiding over-splitting. The UMAP plot labeled by clusters revealed 26 distinct clusters, reflecting the cellular heterogeneity within the dataset.

In total, the dataset contained 58,088 cells, with contributions from each sample as follows: case1_YF (8,594 cells), case1_ZY (7,781), case2_YF (9,970), case2_ZC (6,106), case2_ZY (8,112), case3_YF (6,922), case3_ZY (6,247), and case4_ZY (1,356 cells). The second UMAP plot, colored by sample origin, showed that many clusters were well-mixed across samples, especially between case2 and case3 (YF and ZY), which share a similar disease state. However, case1_YF and case1_ZY appeared to form more distinct clusters, suggesting potential batch effects or technical variation despite their shared biological condition. This pattern indicates that integration might be necessary, particularly to address the separation seen in case1, ensuring that downstream analyses capture true biological signals rather than technical artifacts.

CCA-Based Integration (Batch Correction)

# Apply normalization and HVG selection to each individual sample 
for (sample in names(seurat_objects)) {
    seurat_objects[[sample]] <- NormalizeData(
        seurat_objects[[sample]],
        normalization.method = "LogNormalize",
        scale.factor = 10000
    )
    
    seurat_objects[[sample]] <- FindVariableFeatures(
        seurat_objects[[sample]],
        selection.method = "vst",
        nfeatures = 2000
    )
}
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
# Find integration anchors using classic CCA-based method
anchors <- FindIntegrationAnchors(object.list = seurat_objects, dims = 1:20)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Computing 2000 integration features
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Warning: The `slot` argument of `SetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Finding neighborhoods
## Finding anchors
##  Found 13403 anchors
## Filtering anchors
##  Retained 6106 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 14244 anchors
## Filtering anchors
##  Retained 2584 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12528 anchors
## Filtering anchors
##  Retained 4449 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 11058 anchors
## Filtering anchors
##  Retained 2441 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12500 anchors
## Filtering anchors
##  Retained 6014 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 14423 anchors
## Filtering anchors
##  Retained 6416 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 13891 anchors
## Filtering anchors
##  Retained 2430 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 11501 anchors
## Filtering anchors
##  Retained 2249 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 11709 anchors
## Filtering anchors
##  Retained 2939 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 10723 anchors
## Filtering anchors
##  Retained 2784 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 11562 anchors
## Filtering anchors
##  Retained 4245 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12766 anchors
## Filtering anchors
##  Retained 6120 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 13872 anchors
## Filtering anchors
##  Retained 6600 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12873 anchors
## Filtering anchors
##  Retained 6684 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 10287 anchors
## Filtering anchors
##  Retained 2139 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 10441 anchors
## Filtering anchors
##  Retained 3477 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12186 anchors
## Filtering anchors
##  Retained 6305 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 14371 anchors
## Filtering anchors
##  Retained 8550 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12961 anchors
## Filtering anchors
##  Retained 7723 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 9852 anchors
## Filtering anchors
##  Retained 2285 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 12499 anchors
## Filtering anchors
##  Retained 7373 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4357 anchors
## Filtering anchors
##  Retained 2510 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4828 anchors
## Filtering anchors
##  Retained 3727 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4879 anchors
## Filtering anchors
##  Retained 3486 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4620 anchors
## Filtering anchors
##  Retained 3792 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4702 anchors
## Filtering anchors
##  Retained 1866 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4945 anchors
## Filtering anchors
##  Retained 3983 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4586 anchors
## Filtering anchors
##  Retained 3522 anchors
# Perform integration
integrated_obj <- IntegrateData(anchorset = anchors)
## Merging dataset 8 into 6
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 7 into 3
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 4 into 6 8
## Extracting anchors for merged samples
## Finding integration vectors
## Warning: Different cells in new layer data than already exists for scale.data
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 3 7 into 6 8 4
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 5 into 1 2
## Extracting anchors for merged samples
## Finding integration vectors
## Warning: Different cells in new layer data than already exists for scale.data
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
## Merging dataset 1 2 5 into 6 8 4 3 7
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Layer counts isn't present in the assay object; returning NULL
# Scale the integrated data
integrated_obj <- ScaleData(integrated_obj)
## Centering and scaling data matrix
# Run PCA
integrated_obj <- RunPCA(integrated_obj)
## PC_ 1 
## Positive:  SRGN, CXCR4, LCP1, CD52, IL7R, VIM, GMFG, HCST, CD69, CYTIP 
##     CD37, LAPTM5, CD53, RGS1, CLEC2B, SLC2A3, ITGB2, CD48, CD3D, CCL5 
##     TNFAIP3, TRBC2, EVL, GIMAP4, DOK2, RGS2, SLA, ARHGDIB, ALOX5AP, CCL4L2 
## Negative:  EPCAM, KRT19, MAL2, TM4SF1, CAMK2N1, AL121761.2, TFF2, FXYD3, KRT8, SPINT2 
##     CD24, C19orf33, CLDN4, TFF3, GPRC5A, ELF3, AGR2, CA9, S100P, DSG2 
##     CTSE, EHF, WFDC2, ATP1B1, CEACAM6, MUC1, KRT18, PERP, TSPAN8, TMC5 
## PC_ 2 
## Positive:  CD3D, CD69, TRBC2, CCL5, CXCR4, IL7R, IL32, GZMA, CD48, TRAC 
##     TRBC1, GZMK, ISG20, ITM2A, DUSP2, NKG7, CYTIP, CST7, OCIAD2, BIRC3 
##     CD27, TIGIT, LTB, GPR171, CD8B, CD7, BATF, KLRB1, SOCS1, HIST1H1D 
## Negative:  PLXDC2, PMP22, DAB2, VCAN, GPNMB, SERPING1, CTSB, NUPR1, FN1, TIMP2 
##     RAB31, NRP1, PSAP, MAFB, IFITM3, COLEC12, LGALS1, LRP1, MRC1, MSR1 
##     FCGR2A, CD14, SDC2, FCGR3A, TGFBI, C1QC, C1QA, FTL, MS4A7, CD68 
## PC_ 3 
## Positive:  LYZ, CD74, HLA-DRB1, HLA-DRA, FCGR3A, FCGR2A, MSR1, TYROBP, C1QC, MS4A7 
##     AIF1, HLA-DQA1, SPP1, C1QA, FCER1G, CD14, CTSS, HLA-DQB1, CAPG, MRC1 
##     SLC11A1, CD68, C1QB, RNASE1, FPR3, CD163, HLA-DPA1, FCGR1A, FCGR2B, CLEC5A 
## Negative:  COL3A1, CALD1, COL1A2, AEBP1, COL1A1, BGN, SPARC, IGFBP7, CTHRC1, COL5A1 
##     MXRA8, COL6A2, CCDC80, ACTA2, TAGLN, C1R, C1S, MMP2, PDGFRB, MMP11 
##     FSTL1, TPM2, DDR2, COL4A2, MYL9, COL4A1, PCOLCE, MYLK, MGP, COL6A1 
## PC_ 4 
## Positive:  HK2, ERO1A, NDRG1, MUC16, SLC6A14, CD55, KRT16, MFSD4A, FTH1, ERRFI1 
##     SPNS2, LAMB3, SLC7A11, CEACAM5, RNF39, PRSS8, GULP1, CXCL8, NEAT1, SLC2A1 
##     F3, TMEM106B, SULT2B1, LMO7, NECTIN4, VEGFA, AHNAK2, ADGRF1, STEAP4, EMP1 
## Negative:  MKI67, TOP2A, ASPM, HMMR, UBE2C, BIRC5, CENPF, TPX2, CDK1, DLGAP5 
##     CCNB2, NUSAP1, ANLN, GTSE1, KNL1, PCLAF, KIF2C, CEP55, PBK, CDC20 
##     NUF2, CDCA8, CIT, CCNA2, DEPDC1, MAD2L1, TYMS, RAD51AP1, AURKB, SGO1 
## PC_ 5 
## Positive:  CXCL8, BCL2A1, SMIM25, S100A8, MCEMP1, IL1RN, FTH1, FCGR3B, PLIN2, AQP9 
##     PI3, AGRP, LINC02154, MARCO, G0S2, C15orf48, CSTB, VCAN, LUCAT1, S100A9 
##     ERO1A, CCL3L1, AC015912.3, CD300E, CLEC4E, HK2, SNX10, ANGPTL4, PHACTR1, NCF2 
## Negative:  MT-CO3, F13A1, HLA-DMB, FOLR2, MS4A6A, HLA-DQA1, STAB1, S100B, GPR34, HLA-DPB1 
##     HLA-DPA1, HLA-DOA, HLA-DQB1, FGL2, HLA-DRB1, HLA-DMA, CD74, CSF1R, RASSF4, CPVL 
##     ARHGDIB, ENPP2, MEF2C, SLC40A1, SELENOP, RGS1, CD4, C1QB, SLCO2B1, MARCH1
# Run UMAP (with 20 PCs)
integrated_obj <- RunUMAP(integrated_obj, dims = 1:20)
## 23:04:12 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:04:12 Read 55041 rows and found 20 numeric columns
## 23:04:12 Using Annoy for neighbor search, n_neighbors = 30
## 23:04:12 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:04:18 Writing NN index file to temp file /scratch/4951884.1.academic/RtmpiNCrnZ/file223106114e2e54
## 23:04:18 Searching Annoy index using 1 thread, search_k = 3000
## 23:04:37 Annoy recall = 100%
## 23:04:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:04:41 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:04:44 Commencing optimization for 200 epochs, with 2449758 positive edges
## 23:04:44 Using rng type: pcg
## 23:05:06 Optimization finished
# Find neighbors and clusters (resolution can be adjusted)
integrated_obj <- FindNeighbors(integrated_obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
integrated_obj <- FindClusters(integrated_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 55041
## Number of edges: 2019040
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9248
## Number of communities: 20
## Elapsed time: 14 seconds

Visualisation after Integration (correct for batch effects)

DimPlot(integrated_obj, reduction = "umap", group.by = "orig.ident") +
    ggtitle("UMAP After Integration (by Sample)")

Before integration, the UMAP plot showed clear separation between samples, especially noticeable between case1_YF and case1_ZY as well as between other cases. Although some of this separation may reflect true biological variability, the distinct clustering of cells by sample origin strongly suggested the presence of technical batch effects. This was concerning because cells with similar biological characteristics but from different samples were not overlapping as expected which could bias downstream analyses.To correct for these batch effects, I applied Seurat’s Canonical Correlation Analysis (CCA)-based integration method.

The after-integration UMAP visualization showed substantially improved overlap between samples across clusters, confirming successful batch correction. The previously distinct separation between case1_YF and case1_ZY was resolved, and overall the dataset appeared well-mixed. This integrated object is now better suited for downstream analysis (e.g., marker gene discovery and differential expression), ensuring that clusters are driven by biology rather than technical artifacts.

Marker Gene Analysis:

# Find all marker genes (differential expression per cluster)
all_markers <- FindAllMarkers(
    integrated_obj,
    only.pos = TRUE,        # Only return positive markers (overexpressed)
    min.pct = 0.25,         # At least 25% of cells express the gene
    logfc.threshold = 0.25  # Log-fold change threshold
)
## Calculating cluster 0
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 1
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 2
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 3
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 4
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 5
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 6
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 7
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 8
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 9
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 10
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 11
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 12
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 13
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 14
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 15
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 16
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 17
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 18
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
## Calculating cluster 19
## Warning in mean.fxn(object[features, cells.1, drop = FALSE]): NaNs produced
# Get top 5 marker genes per cluster
top5_markers <- all_markers %>%
    group_by(cluster) %>%
    top_n(5, avg_log2FC)

# Show the table
print(top5_markers)
## # A tibble: 100 × 7
## # Groups:   cluster [20]
##    p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene      
##    <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>     
##  1     0       2.30 0.795 0.565         0 0       BEX2      
##  2     0       2.85 0.716 0.506         0 0       KLRB1     
##  3     0       2.93 0.979 0.78          0 0       IL7R      
##  4     0       4.54 0.715 0.544         0 0       CD40LG    
##  5     0       2.67 0.707 0.558         0 0       CCR7      
##  6     0       3.88 0.834 0.497         0 1       AC007906.2
##  7     0       3.20 0.7   0.416         0 1       SLC26A9   
##  8     0       3.29 0.413 0.144         0 1       LEFTY1    
##  9     0       3.19 0.91  0.748         0 1       AQP5      
## 10     0       3.28 0.785 0.635         0 1       PPP1R1B   
## # ℹ 90 more rows

In this analysis, I used Seurat’s FindAllMarkers() function, which performs Wilcoxon rank-sum tests by default to identify genes differentially expressed in each cluster compared to all other cells. This method is non-parametric, making no assumptions about normality, which is advantageous for sparse scRNA-seq data. Advantages include its robustness to outliers and suitability for the typical zero-inflated distributions seen in single-cell data.

However, the Wilcoxon test also has limitations: it can be computationally intensive for large datasets and may not account for confounding factors like batch effects or technical covariates unless handled explicitly. Alternative methods like MAST or DESeq2 can model such effects but may require additional assumptions.

Automatic Annotation of Cell Labels

library(SingleR)
library(celldex)
## 
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData
library(Seurat)
ref <- celldex::HumanPrimaryCellAtlasData()
gene_union <- unique(unlist(lapply(sample_names, function(sample) {
    rownames(GetAssayData(integrated_obj[["RNA"]], layer = paste0("data.", sample)))
})))
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
mat_list <- list()

for (sample in sample_names) {
    cat("Extracting & aligning:", sample, "\n")
    
    # Extract expression matrix for sample
    mat <- GetAssayData(integrated_obj[["RNA"]], layer = paste0("data.", sample))
    
    # Identify genes present in this sample
    genes_present <- rownames(mat)
    
    # Create a full matrix with all genes (match order)
    missing_genes <- setdiff(gene_union, genes_present)
    
    # If any genes are missing, create a sparse matrix of 0s for them
    if (length(missing_genes) > 0) {
        zero_mat <- Matrix(0,
                           nrow = length(missing_genes),
                           ncol = ncol(mat),
                           dimnames = list(missing_genes, colnames(mat)))
        # Combine the original and zero matrices
        mat <- rbind(mat, zero_mat)
    }
    
    # Reorder rows to match the gene_union order
    mat <- mat[gene_union, , drop = FALSE]
    
    mat_list[[sample]] <- mat
}
## Extracting & aligning: case1_YF 
## Extracting & aligning: case1_ZY 
## Extracting & aligning: case2_YF 
## Extracting & aligning: case2_ZY 
## Extracting & aligning: case2_ZC 
## Extracting & aligning: case3_YF 
## Extracting & aligning: case3_ZY 
## Extracting & aligning: case4_ZY
# Combine all matrices into one
expr_matrix <- do.call(cbind, mat_list)

# Confirm dimensions
dim(expr_matrix)
## [1] 25870 55041
library(SingleCellExperiment)
sce_obj <- SingleCellExperiment(assays = list(logcounts = expr_matrix))

ref <- celldex::HumanPrimaryCellAtlasData()

singleR_results <- SingleR(
    test = sce_obj,
    ref = ref,
    labels = ref$label.main
)

# Create a deep copy of the integrated object
integrated_obj_copy <- integrated_obj
# Add SingleR labels (per cell) to the copy
integrated_obj_copy$SingleR_labels <- singleR_results$labels
DimPlot(integrated_obj, reduction = "umap", label = TRUE) +
    ggtitle("UMAP: Clusters Annotated by SingleR")

# Build a table mapping clusters to most frequent SingleR label
cluster_labels <- table(integrated_obj_copy$seurat_clusters, integrated_obj_copy$SingleR_labels)

# Get the dominant label per cluster
majority_labels <- apply(cluster_labels, 1, function(x) names(which.max(x)))

# Create new vector where each cell is assigned its cluster’s dominant label
cluster_annotation <- majority_labels[as.character(integrated_obj_copy$seurat_clusters)]
names(cluster_annotation) <- colnames(integrated_obj_copy)

# Add to Seurat metadata
integrated_obj_copy <- AddMetaData(integrated_obj_copy, metadata = cluster_annotation, col.name = "Cluster_Label")

DimPlot(integrated_obj_copy, group.by = "Cluster_Label", label = TRUE, repel = TRUE) +
    ggtitle("UMAP Annotated by Majority Cell Type per Cluster")

Visualisation:

library(patchwork)
# Plot 1: majority cell type per cluster
plot_annot <- DimPlot(integrated_obj_copy, group.by = "Cluster_Label", label = TRUE, repel = TRUE) +
  ggtitle("UMAP per Cluster")

# Plot 2: Seurat cluster IDs
plot_cluster <- DimPlot(integrated_obj_copy, group.by = "seurat_clusters", label = TRUE, repel = TRUE) +
  ggtitle("UMAP by Cluster Number")

# Combine side-by-side
plot_annot + plot_cluster

I used the SingleR algorithm (Aran et al., 2019) with the Human Primary Cell Atlas (HPCA) reference dataset to assign preliminary cell type identities to the clusters in our integrated single-cell RNA-seq dataset. SingleR works by comparing each cell’s gene expression profile to reference profiles of purified cell types and assigning the label of the closest match based on correlation. After obtaining per-cell annotations from SingleR, I summarized the predictions at the cluster level by taking the majority label for each cluster. This approach allowed us to assign a dominant cell identity to every cluster in an interpretable way while preserving the unsupervised clustering structure.

The UMAP visualization annotated by these majority-vote labels showed distinct biological cell types aligning with the clustering, including epithelial cells, macrophages, neutrophils, monocytes, NK cells, T cells, B cells, dendritic cells, and chondrocytes. These identities are consistent with known components of the tumor microenvironment, capturing both tumor and immune compartments. Some clusters exhibited mixed annotations at the per-cell level, suggesting possible heterogeneity or transitional states within those populations. Overall, SingleR provided an efficient and reproducible initial annotation of the dataset, which will be refined in later steps by integrating marker gene expression and literature evidence.

Citation: Aran, D., Looney, A. P., Liu, L., Wu, E., Fong, V., Hsu, A., Chak, S., Naikawadi, R. P., Wolters, P. J., Abate, A. R., Butte, A. J., & Bhattacharya, M. (2019). Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. In Nature Immunology (Vol. 20, Issue 2, pp. 163–172). Springer Science and Business Media LLC. https://doi.org/10.1038/s41590-018-0276-y

Manual Annotation

Single plot showing top marker genes per cluster (violin plot)

# Get top 3 markers per cluster
top3_markers <- all_markers %>%
  group_by(cluster) %>%
  top_n(3, avg_log2FC)
top_genes_all <- unique(top3_markers$gene)

violin_plot <- VlnPlot(integrated_obj_copy, features = top_genes_all, group.by = "seurat_clusters",
          pt.size = 0.1, stack = TRUE, flip = TRUE) +
    ggtitle("Violin Plot: Top 3 Marker Genes per Cluster")
# Show plot
print(violin_plot)

# Save as PNG
ggsave("ViolinPlot_Top3Markers_AllClusters.png", violin_plot, width = 14, height = 20, dpi = 300)

Individual plots for top 3 clusters

cluster_sizes <- table(Idents(integrated_obj_copy))

# Sort clusters in descending order
sorted_clusters <- sort(cluster_sizes, decreasing = TRUE)

# Get names of the top 3 clusters
top3_clusters <- names(sorted_clusters)[1:3]

# Print the top 3 clusters and their sizes
print(sorted_clusters[1:3])
## 
##    0    1    2 
## 7840 7505 5814
# Subset top 5 markers for top 3 clusters
top5_markers_top3 <- top5_markers %>%
  filter(cluster %in% top3_clusters)

for (clust in top3_clusters) {
  genes <- top5_markers_top3 %>% filter(cluster == clust) %>% pull(gene)
  
  # Create stacked violin plot
  p <- VlnPlot(
    integrated_obj_copy,
    features = genes,
    group.by = "seurat_clusters",
    pt.size = 0.1,
    stack = TRUE,
    flip = TRUE
  ) +
    ggtitle(paste("Cluster", clust, "- Top 5 Marker Genes (Across All Clusters)")) +
    theme(
      plot.title = element_text(size = 18, hjust = 0.5, face = "bold"),
      axis.text.x = element_text(size = 12),
      axis.text.y = element_text(size = 12),
      axis.title.x = element_text(size = 14),
      axis.title.y = element_text(size = 14),
      legend.text = element_text(size = 12),
      legend.title = element_text(size = 14)
    )
  
  # Display plot
  print(p)
}

Visualization of clusters with manual annotation

# Create named vector (cluster -> manual label)
manual_labels <- c(
  "0" = "T cell",
  "1" = "Endocrine cell",
  "2" = "NKT",
  "3" = "NKT",
  "4" = "Fibroblast",
  "5" = "Ductal cell",
  "6" = "Ductal cell",
  "7" = "Endocrine cell",
  "8" = "Myeloid cell",
  "9" = "Plasma cell",
  "10" = "MKI67+ ductal cell",
  "11" = "Endothelial cell",
  "12" = "Treg",
  "13" = "NKT",
  "14" = "Mast cell",
  "15" = "Dendritic cell",
  "16" = "B cell",
  "17" = "Plasma cell",
  "18" = "MKI67+ ductal cell",
  "19" = "MKI67+ ductal cell"
)

# Map annotations
annot_vector <- manual_labels[as.character(as.character(integrated_obj_copy$seurat_clusters))]
names(annot_vector) <- colnames(integrated_obj_copy)
# Add to metadata
integrated_obj_copy <- AddMetaData(integrated_obj_copy, metadata = annot_vector, col.name = "Manual_Annotation")
DimPlot(integrated_obj_copy, group.by = "Manual_Annotation", label = TRUE, repel = TRUE) +
  ggtitle("UMAP Annotated by Manual Cluster Labels")

Based on marker gene expression and literature review, I manually assigned biological identities to each cluster by integrating results from marker gene analysis and automatic annotation. For example, cluster 0 was annotated as T cells, characterized by high expression of canonical T cell markers such as IL7R and KLRB1 (Müller et al., 2018). Clusters 2 and 3 were labeled as NKT cells due to strong expression of CD40LG and cytotoxic markers (GZMK), aligning with innate-like lymphocyte profiles described in pancreatic tissues (Arase et al., 1992; Kronenberg, 2014). Cluster 4 exhibited elevated expression of extracellular matrix genes including COL1A2, COL3A1, and MMP11, consistent with fibroblast identity (LeBleu & Neilson, 2020). Cluster 5 and 6 showed high levels of SCEL and KRT86, markers associated with ductal epithelial cells (Basturk et al., 2005). Clusters 9 and 17 were annotated as plasma cells, supported by high expression of immunoglobulin genes like IGLC2, IGHA1, and IGLC3 (Nutt et al., 2015). Cluster 11 displayed endothelial marker TTR, suggesting endothelial cell identity (Kamei & Carman, 2010). Cluster 12 was labeled T regulatory cells (Tregs) based on prominent FOXP3 expression, a canonical Treg marker (Sakaguchi et al., 2008). The manual annotations largely corroborated the automatic annotations generated by SingleR, with minor refinements based on tissue context and published cell type-specific marker profiles.

These manual labels allowed refinement of cluster identities by incorporating both known marker genes from prior studies and dataset-specific expression patterns, providing biologically meaningful classification for downstream analysis.

Pseudotime Analysis:

integrated_obj_copy$Condition <- ifelse(grepl("case1_", integrated_obj_copy$orig.ident), "NT",
                                 ifelse(grepl("case2_", integrated_obj_copy$orig.ident), "PT",
                                 ifelse(grepl("case3_|case4_", integrated_obj_copy$orig.ident), "HM", NA)))

Analysis 1: Pseudotime Analysis

Pseudotime Analysis for all the cells

# Load monocle3
library(monocle3)
## 
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
## 
##     exprs, fData, fData<-, pData, pData<-
library(SeuratWrappers)

# Convert Seurat object to Monocle3 CellDataSet
cds <- as.cell_data_set(integrated_obj_copy)
## Warning: Monocle 3 trajectories require cluster partitions, which Seurat does
## not calculate. Please run 'cluster_cells' on your cell_data_set object
# Transfer cluster labels from Seurat to Monocle
cds@clusters$UMAP$clusters <- Idents(integrated_obj_copy)
cds@colData$Condition <- integrated_obj_copy$Condition[colnames(cds)]

# Learn trajectory graph
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Order cells
cds <- order_cells(cds, root_pr_nodes = "Y_1")
# Plot pseudotime (equivalent to p1)
p1 <- plot_cells(cds,
                 color_cells_by = "pseudotime",
                 show_trajectory_graph = TRUE,
                 label_groups_by_cluster = FALSE,
                 label_leaves = FALSE,
                 label_branch_points = FALSE) +
  ggtitle("Pseudotime")
## Cells aren't colored in a way that allows them to be grouped.
# Plot by cluster (equivalent to p2)
p2 <- plot_cells(cds,
                 color_cells_by = "cluster",
                 show_trajectory_graph = TRUE,
                 label_groups_by_cluster = TRUE) +
  ggtitle("cluster")

# Plot by sample group (equivalent to p3)
p3 <- plot_cells(cds,
                 color_cells_by = "Condition",
                 show_trajectory_graph = TRUE,
                 label_groups_by_cluster = FALSE) +
  ggtitle("sample group")

# Arrange into panel 
library(patchwork)
combined_plot <- (p1 | p2 | p3)

print(combined_plot)

Pseudotime subsetted for Ductal cells (Fig 3E)

library(monocle3)
library(SeuratWrappers)
library(patchwork)

# Subset for ductal cells
ductal_obj <- subset(integrated_obj_copy, subset = Manual_Annotation == "Ductal cell")

# Convert to Monocle3 
cds <- as.cell_data_set(ductal_obj, assay = "integrated", layer = "data")
## Warning: Monocle 3 trajectories require cluster partitions, which Seurat does
## not calculate. Please run 'cluster_cells' on your cell_data_set object
# Transfer cluster info
cds@clusters$UMAP$clusters <- Idents(ductal_obj)

# Transfer metadata
colData(cds)$Condition <- ductal_obj$Condition
colData(cds)$Manual_Annotation <- ductal_obj$Manual_Annotation

# Cluster cells in Monocle
cds <- cluster_cells(cds)

# Learn global trajectory graph (IMPORTANT → avoid partitions splitting)
cds <- learn_graph(cds, use_partition = FALSE)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Order cells (after selecting a root interactively)
plot_cells(cds, color_cells_by = "Condition")  # to select root

cds <- order_cells(cds, root_pr_nodes = "Y_1")

# Create plots

# Plot pseudotime
p1 <- plot_cells(cds,
                 color_cells_by = "pseudotime",
                 show_trajectory_graph = TRUE,
                 label_groups_by_cluster = FALSE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 label_roots = TRUE,
                 cell_size = 0.8) +
  scale_color_viridis_c(option = "plasma", direction = -1) +
  ggtitle("Pseudotime")
## Cells aren't colored in a way that allows them to be grouped.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# Plot by cluster
p2 <- plot_cells(cds,
                 color_cells_by = "cluster",
                 show_trajectory_graph = TRUE,
                 label_groups_by_cluster = TRUE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 label_roots = TRUE,
                 cell_size = 0.8) +
  ggtitle("Cluster")

# Plot by sample group
p3 <- plot_cells(cds,
                 color_cells_by = "Condition",
                 show_trajectory_graph = TRUE,
                 label_groups_by_cluster = FALSE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 label_roots = TRUE,
                 cell_size = 0.8) +
  ggtitle("Sample Group")

# Combine into panel
combined_plot <- (p1 | p2 | p3)

# Display
print(combined_plot)

Fig 3 C from the paper: I performed pseudotime analysis using Monocle3, initially across the entire cellular population before narrowing the analysis to the ductal cell subset. In the full dataset analysis, pseudotime trajectories captured broader differentiation paths across diverse cell types but displayed complex and intertwined branches, making it difficult to interpret ductal-specific progression. To focus specifically on ductal lineage dynamics, I subsetted the integrated dataset to include only ductal cells and reconstructed pseudotime trajectories within this compartment. The trajectory for ductal cells revealed two major disconnected manifolds: one corresponding predominantly to normal tissue (NT) cells and another enriched for primary tumor (PT) and metastasis (HM) cells. The pseudotime ordering spans from low values in the NT-dominated manifold to higher values in the tumor-enriched manifold, connected by an artificial long trajectory bridge between them. This pattern suggests transcriptional divergence between normal and malignant ductal cells, leading to structural separation in UMAP space. In contrast to the continuous tree-like trajectory observed in the paper, the trajectory reflected a more fragmented topology, potentially due to residual batch effects or inherent biological heterogeneity between normal and tumor ductal compartments. Similar molecular divergence between normal and malignant ductal cells has been reported in pancreatic cancer studies (Chan-Seng-Yue et al., 2020). Despite structural differences, pseudotime ordering recapitulated progressive transcriptional changes from normal-like to malignant ductal phenotypes, consistent with tumor progression.

Analysis 2: Cell type proportion analysis (Figure 1E)

library(dplyr)
library(ggplot2)

# Get metadata for ALL cells
meta <- integrated_obj_copy@meta.data

# Tabulate counts per cell type and condition
counts <- meta %>%
  group_by(Condition, Manual_Annotation) %>%
  summarize(n_cells = n()) %>%
  group_by(Condition) %>%
  mutate(percentage = n_cells / sum(n_cells) * 100)
## `summarise()` has grouped output by 'Condition'. You can override using the
## `.groups` argument.
# Stacked barplot
p2 <- ggplot(counts, aes(x = Condition, y = percentage, fill = Manual_Annotation)) +
  geom_bar(stat = "identity", color = "black") +
  theme_minimal() +
  ylab("Percent of cells per condition") +
  ggtitle("Cell type proportions across conditions") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Display plot
print(p2)

library(dplyr)
library(ggplot2)

counts <- integrated_obj_copy@meta.data %>%
  group_by(Condition, Manual_Annotation) %>%
  summarize(n_cells = n()) %>%
  group_by(Condition) %>%
  mutate(percentage = n_cells / sum(n_cells) * 100)
## `summarise()` has grouped output by 'Condition'. You can override using the
## `.groups` argument.
ggplot(counts, aes(x = percentage, y = Manual_Annotation, fill = Condition)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  xlab("% cells per type") +
  ggtitle("Cell type proportions across conditions")

Extra Analysis: Cell–Cell Communication Analysis (CellChat)

library(CellChat)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:monocle3':
## 
##     clusters
## The following objects are masked from 'package:future':
## 
##     %->%, %<-%
## The following object is masked from 'package:GenomicRanges':
## 
##     union
## The following object is masked from 'package:IRanges':
## 
##     union
## The following object is masked from 'package:S4Vectors':
## 
##     union
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, path, union
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following object is masked from 'package:Seurat':
## 
##     components
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(patchwork)
library(Seurat)
library(SeuratObject)
library(Matrix)

# Extract expression matrix
expr_matrix <- GetAssayData(integrated_obj_copy, slot = "data")

# Subset matrix to keep only positive values (replace negatives with small number)
expr_matrix@x[expr_matrix@x < 0] <- 0  # if sparse matrix
# Alternatively: expr_matrix[expr_matrix < 0] <- 0 if dense

# Remove rows (genes) with all zeros
nonzero_genes <- rowSums(expr_matrix) > 0
expr_matrix <- expr_matrix[nonzero_genes, ]

# Remove columns (cells) with all zeros
nonzero_cells <- colSums(expr_matrix) > 0
expr_matrix <- expr_matrix[, nonzero_cells]

# Check dimension again
cat("Filtered matrix dimensions:", dim(expr_matrix), "\n")
## Filtered matrix dimensions: 2000 55041
# Also filter metadata to match cells
meta_data <- integrated_obj_copy@meta.data
meta_data <- meta_data[colnames(expr_matrix), , drop = FALSE]
meta_data$labels <- meta_data$Manual_Annotation

cellchat <- createCellChat(object = expr_matrix, meta = meta_data, group.by = "labels")
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object 
## The cell groups used for CellChat analysis are  B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg
# Load database
cellchat@DB <- CellChatDB.human

# Subset signaling genes
cellchat <- subsetData(cellchat)

# Identify overexpressed genes/interactions
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- identifyOverExpressedInteractions(cellchat)

# Compute communication probabilities
cellchat <- computeCommunProb(cellchat)
## triMean is used for calculating the average gene expression per cell group. 
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:31:01.116943]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:34:16.709327]"
cellchat <- filterCommunication(cellchat, min.cells = 10)
cellchat <- computeCommunProbPathway(cellchat)

# Aggregate network
cellchat <- aggregateNet(cellchat)
# Show overall interaction network
netVisual_circle(cellchat@net$count, vertex.weight = as.numeric(table(cellchat@idents)), weight.scale = TRUE, label.edge = FALSE)

# Heatmap of number of interactions
netVisual_heatmap(cellchat)
## Do heatmap based on a single object

I used CellChat (Jin et al., 2021) to infer global cell–cell communication networks across the annotated cell populations. This analysis revealed that ductal cells, myeloid cells, fibroblasts, and NKT cells formed dense interaction hubs, showing the highest number of predicted ligand–receptor interactions with other cell types. In contrast, endocrine and endothelial cells displayed fewer interactions, suggesting lower participation in signaling within the tumor microenvironment. Notably, communication between ductal and myeloid cells was highly enriched, consistent with reported roles of tumor-associated macrophages in promoting pancreatic cancer progression. The heatmap further highlighted robust bidirectional signaling between ductal cells and fibroblasts, implicating potential tumor–stroma crosstalk. This network-level analysis complements the cell type and pseudotime analyses by identifying key interacting populations and suggests that therapeutic targeting of stromal or immune-mediated interactions may disrupt pro-tumorigenic signaling circuits in pancreatic cancer.

Condition-Specific Cell–Cell Communication Analysis Using CellChat

integrated_obj_copy$Condition <- ifelse(grepl("case1_", integrated_obj_copy$orig.ident), "NT",
                                 ifelse(grepl("case2_", integrated_obj_copy$orig.ident), "PT",
                                 ifelse(grepl("case3_|case4_", integrated_obj_copy$orig.ident), "HM", NA)))
seurat_list <- SplitObject(integrated_obj_copy, split.by = "Condition")
cellchat_list <- list()

for (cond in names(seurat_list)) {
  cat("Processing condition:", cond, "\n")
  
  # Extract expression matrix
  expr_matrix <- GetAssayData(seurat_list[[cond]], slot = "data")
  expr_matrix@x[expr_matrix@x < 0] <- 0
  nonzero_genes <- rowSums(expr_matrix) > 0
  expr_matrix <- expr_matrix[nonzero_genes, ]
  nonzero_cells <- colSums(expr_matrix) > 0
  expr_matrix <- expr_matrix[, nonzero_cells]
  
  # Metadata
  meta_data <- seurat_list[[cond]]@meta.data
  meta_data <- meta_data[colnames(expr_matrix), , drop = FALSE]
  meta_data$labels <- meta_data$Manual_Annotation
  
  # Create CellChat object
  cellchat <- createCellChat(object = expr_matrix, meta = meta_data, group.by = "labels")
  cellchat@DB <- CellChatDB.human
  
  # Subset signaling genes
  cellchat <- subsetData(cellchat)
  
  # Overexpressed genes/interactions
  cellchat <- identifyOverExpressedGenes(cellchat)
  cellchat <- identifyOverExpressedInteractions(cellchat)
  
  # Compute communication
  cellchat <- computeCommunProb(cellchat)
  cellchat <- filterCommunication(cellchat, min.cells = 10)
  cellchat <- computeCommunProbPathway(cellchat)
  cellchat <- aggregateNet(cellchat)
  
  # Store in list
  cellchat_list[[cond]] <- cellchat
}
## Processing condition: NT 
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object 
## The cell groups used for CellChat analysis are  B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg 
## triMean is used for calculating the average gene expression per cell group. 
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:35:00.440166]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:36:33.707042]"
## Processing condition: PT 
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object 
## The cell groups used for CellChat analysis are  B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg 
## triMean is used for calculating the average gene expression per cell group. 
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:37:22.652678]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:39:15.912937]"
## Processing condition: HM 
## [1] "Create a CellChat object from a data matrix"
## Set cell identities for the new CellChat object 
## The cell groups used for CellChat analysis are  B cell Dendritic cell Ductal cell Endocrine cell Endothelial cell Fibroblast Mast cell MKI67+ ductal cell Myeloid cell NKT Plasma cell T cell Treg 
## triMean is used for calculating the average gene expression per cell group. 
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2025-05-07 23:39:40.062835]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2025-05-07 23:41:06.453488]"
# Plot heatmaps for each condition with custom titles
netVisual_heatmap(cellchat_list[["NT"]], measure = "count", title.name = "NT")
## Do heatmap based on a single object

netVisual_heatmap(cellchat_list[["PT"]], measure = "count", title.name = "PT")
## Do heatmap based on a single object

netVisual_heatmap(cellchat_list[["HM"]], measure = "count", title.name = "HM")
## Do heatmap based on a single object

In addition to the global cell–cell communication network, I stratified the analysis by condition to compare interaction patterns between normal tissue (NT), primary tumor (PT), and metastasis (HM). The heatmaps show the number of predicted ligand–receptor interactions from sender (rows) to receiver (columns) across each condition. Notably, fibroblasts, myeloid cells, and ductal cells exhibited higher interaction counts in primary tumor and metastasis relative to normal tissue, suggesting increased stromal and immune crosstalk in tumor and metastatic microenvironments. In contrast, endocrine and endothelial cells showed fewer interactions in tumor and metastasis, reflecting possible loss of normal tissue signaling. These patterns align with known roles of fibroblasts and tumor-associated macrophages in promoting pancreatic cancer progression through paracrine signaling. This condition-specific analysis highlights shifts in communication networks during tumor progression and metastasis, identifying fibroblast–ductal and myeloid–ductal interactions as potential contributors to tumor–stroma crosstalk.

Citation: Hosein, A. N., Huang, H., Wang, Z., Parmar, K., Du, W., Huang, J., Maitra, A., Olson, E., Verma, U., & Brekken, R. A. (2019). Cellular heterogeneity during mouse pancreatic ductal adenocarcinoma progression at single-cell resolution. JCI Insight, 4(16). https://doi.org/10.1172/jci.insight.129212

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: AlmaLinux 8.10 (Cerulean Leopard)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS NETLIB;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] CellChat_1.6.1              bigmemory_4.6.4            
##  [3] igraph_2.1.4                SeuratWrappers_0.4.0       
##  [5] monocle3_1.3.7              patchwork_1.3.0            
##  [7] Matrix_1.7-0                celldex_1.13.3             
##  [9] SingleR_2.6.0               future_1.40.0              
## [11] SeuratDisk_0.0.0.9021       scDblFinder_1.18.0         
## [13] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [15] Biobase_2.64.0              GenomicRanges_1.56.0       
## [17] GenomeInfoDb_1.40.0         IRanges_2.38.0             
## [19] S4Vectors_0.42.0            BiocGenerics_0.50.0        
## [21] MatrixGenerics_1.16.0       matrixStats_1.5.0          
## [23] lubridate_1.9.3             forcats_1.0.0              
## [25] stringr_1.5.1               dplyr_1.1.4                
## [27] purrr_1.0.4                 readr_2.1.5                
## [29] tidyr_1.3.1                 tibble_3.2.1               
## [31] tidyverse_2.0.0             ggplot2_3.5.2              
## [33] Seurat_5.3.0                SeuratObject_5.1.0         
## [35] sp_2.2-0                   
## 
## loaded via a namespace (and not attached):
##   [1] R.methodsS3_1.8.2         dichromat_2.0-0.1        
##   [3] paws.storage_0.5.0        goftest_1.2-3            
##   [5] Biostrings_2.72.0         HDF5Array_1.32.0         
##   [7] vctrs_0.6.5               spatstat.random_3.3-3    
##   [9] shape_1.4.6.1             proxy_0.4-27             
##  [11] digest_0.6.37             png_0.1-8                
##  [13] registry_0.5-1            gypsum_1.0.0             
##  [15] ggrepel_0.9.6             deldir_2.0-4             
##  [17] parallelly_1.43.0         magick_2.8.3             
##  [19] MASS_7.3-60.2             reshape2_1.4.4           
##  [21] foreach_1.5.2             httpuv_1.6.16            
##  [23] withr_3.0.2               ggrastr_1.0.2            
##  [25] xfun_0.52                 ggpubr_0.6.0             
##  [27] survival_3.6-4            memoise_2.0.1            
##  [29] ggbeeswarm_0.7.2          systemfonts_1.2.3        
##  [31] GlobalOptions_0.1.2       ragg_1.4.0               
##  [33] zoo_1.8-14                pbapply_1.7-2            
##  [35] R.oo_1.27.1               KEGGREST_1.44.0          
##  [37] promises_1.3.2            httr_1.4.7               
##  [39] rstatix_0.7.2             restfulr_0.0.15          
##  [41] globals_0.17.0            fitdistrplus_1.2-2       
##  [43] rhdf5filters_1.16.0       rhdf5_2.48.0             
##  [45] rstudioapi_0.16.0         UCSC.utils_1.0.0         
##  [47] miniUI_0.1.2              generics_0.1.3           
##  [49] ggalluvial_0.12.5         curl_6.2.2               
##  [51] zlibbioc_1.50.0           ScaledMatrix_1.12.0      
##  [53] polyclip_1.10-7           GenomeInfoDbData_1.2.12  
##  [55] ExperimentHub_2.12.0      SparseArray_1.4.0        
##  [57] doParallel_1.0.17         xtable_1.8-4             
##  [59] evaluate_1.0.3            S4Arrays_1.4.0           
##  [61] BiocFileCache_2.12.0      hms_1.1.3                
##  [63] irlba_2.3.5.1             colorspace_2.1-0         
##  [65] filelock_1.0.3            hdf5r_1.3.12             
##  [67] ggnetwork_0.5.13          ROCR_1.0-11              
##  [69] reticulate_1.42.0         spatstat.data_3.1-6      
##  [71] magrittr_2.0.3            lmtest_0.9-40            
##  [73] later_1.4.2               viridis_0.6.5            
##  [75] lattice_0.22-6            NMF_0.28                 
##  [77] spatstat.geom_3.3-6       future.apply_1.11.3      
##  [79] scattermore_1.2           XML_3.99-0.18            
##  [81] scuttle_1.14.0            cowplot_1.1.3            
##  [83] RcppAnnoy_0.0.22          pillar_1.10.2            
##  [85] nlme_3.1-164              sna_2.7-2                
##  [87] iterators_1.0.14          gridBase_0.4-7           
##  [89] compiler_4.4.0            beachmat_2.20.0          
##  [91] RSpectra_0.16-2           stringi_1.8.7            
##  [93] tensor_1.5                minqa_1.2.8              
##  [95] GenomicAlignments_1.40.0  plyr_1.8.9               
##  [97] crayon_1.5.3              abind_1.4-8              
##  [99] BiocIO_1.14.0             scater_1.32.0            
## [101] locfit_1.5-9.9            bit_4.6.0                
## [103] codetools_0.2-20          textshaping_1.0.1        
## [105] BiocSingular_1.20.0       bslib_0.9.0              
## [107] alabaster.ranges_1.4.0    GetoptLong_1.0.5         
## [109] plotly_4.10.4             leidenbase_0.1.35        
## [111] mime_0.13                 splines_4.4.0            
## [113] circlize_0.4.16           paws.common_0.7.2        
## [115] Rcpp_1.0.14               fastDummies_1.7.5        
## [117] dbplyr_2.5.0              sparseMatrixStats_1.16.0 
## [119] knitr_1.50                blob_1.2.4               
## [121] utf8_1.2.5                clue_0.3-65              
## [123] BiocVersion_3.19.1        lme4_1.1-37              
## [125] listenv_0.9.1             DelayedMatrixStats_1.26.0
## [127] Rdpack_2.6.4              ggsignif_0.6.4           
## [129] statmod_1.5.0             svglite_2.1.3            
## [131] tzdb_0.5.0                network_1.18.2           
## [133] pkgconfig_2.0.3           tools_4.4.0              
## [135] cachem_1.1.0              rbibutils_2.3            
## [137] RSQLite_2.3.11            viridisLite_0.4.2        
## [139] DBI_1.2.3                 fastmap_1.2.0            
## [141] rmarkdown_2.29            scales_1.4.0             
## [143] grid_4.4.0                ica_1.0-3                
## [145] Rsamtools_2.20.0          broom_1.0.5              
## [147] AnnotationHub_3.12.0      sass_0.4.10              
## [149] coda_0.19-4.1             FNN_1.1.4.1              
## [151] BiocManager_1.30.25       dotCall64_1.2            
## [153] carData_3.0-5             RANN_2.6.2               
## [155] alabaster.schemas_1.4.0   farver_2.1.2             
## [157] reformulas_0.4.1          yaml_2.3.10              
## [159] rtracklayer_1.64.0        cli_3.6.5                
## [161] lifecycle_1.0.4           uwot_0.2.3               
## [163] backports_1.4.1           presto_1.0.0             
## [165] bluster_1.14.0            BiocParallel_1.38.0      
## [167] timechange_0.3.0          gtable_0.3.6             
## [169] rjson_0.2.23              ggridges_0.5.6           
## [171] progressr_0.15.1          parallel_4.4.0           
## [173] limma_3.60.0              jsonlite_2.0.0           
## [175] edgeR_4.2.0               RcppHNSW_0.6.0           
## [177] bitops_1.0-9              bigmemory.sri_0.1.8      
## [179] bit64_4.6.0-1             assertthat_0.2.1         
## [181] xgboost_1.7.7.1           Rtsne_0.17               
## [183] alabaster.matrix_1.4.0    spatstat.utils_3.1-3     
## [185] BiocNeighbors_1.22.0      jquerylib_0.1.4          
## [187] metapod_1.12.0            alabaster.se_1.4.0       
## [189] dqrng_0.4.1               spatstat.univar_3.1-2    
## [191] R.utils_2.13.0            lazyeval_0.2.2           
## [193] alabaster.base_1.4.0      shiny_1.10.0             
## [195] htmltools_0.5.8.1         sctransform_0.4.2        
## [197] rappdirs_0.3.3            glue_1.8.0               
## [199] spam_2.11-1               httr2_1.0.1              
## [201] XVector_0.44.0            RCurl_1.98-1.17          
## [203] scran_1.32.0              gridExtra_2.3            
## [205] boot_1.3-30               R6_2.6.1                 
## [207] labeling_0.4.3            rngtools_1.5.2           
## [209] cluster_2.1.6             Rhdf5lib_1.26.0          
## [211] statnet.common_4.9.0      nloptr_2.2.1             
## [213] DelayedArray_0.30.0       tidyselect_1.2.1         
## [215] vipor_0.4.7               car_3.1-2                
## [217] AnnotationDbi_1.66.0      rsvd_1.0.5               
## [219] KernSmooth_2.23-22        data.table_1.17.0        
## [221] ComplexHeatmap_2.20.0     htmlwidgets_1.6.4        
## [223] RColorBrewer_1.1-3        rlang_1.1.6              
## [225] spatstat.sparse_3.1-0     spatstat.explore_3.4-2   
## [227] uuid_1.2-1                remotes_2.5.0            
## [229] Cairo_1.6-2               beeswarm_0.4.0